This is my Exam 3 document
Let's load the data and take a look at it.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(gganimate)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the pipe-delimited BioLog data file into a data frame.
# stringsAsFactors = TRUE pins the behaviour this document relies on (text
# columns stored as factors); since R 4.0.0 read.csv() would otherwise leave
# them as character vectors and the summaries below would look different.
data <- read.csv(
  file = "EXAMS/Exam_3/BioLogData_Exam3.csv",
  sep = "|",
  stringsAsFactors = TRUE
)
# Per-column overview: level counts for factors, quantiles and mean for
# numeric columns.
summary(data)
## Sample.ID Rep Well Dilution
## Clear_Creek:288 Min. :1 A1 : 36 Min. :0.001
## Soil_1 :288 1st Qu.:1 A2 : 36 1st Qu.:0.001
## Soil_2 :288 Median :2 A3 : 36 Median :0.010
## Waste_Water:288 Mean :2 A4 : 36 Mean :0.037
## 3rd Qu.:3 B1 : 36 3rd Qu.:0.100
## Max. :3 B2 : 36 Max. :0.100
## (Other):936
## Substrate Hr_24 Hr_48
## 2-Hydroxy Benzoic Acid : 36 Min. :0.0000 Min. :0.0000
## 4-Hydroxy Benzoic Acid : 36 1st Qu.:0.0000 1st Qu.:0.0060
## D-Cellobiose : 36 Median :0.0320 Median :0.2595
## D-Galactonic Acid γ-Lactone: 36 Mean :0.1703 Mean :0.4691
## D-Galacturonic Acid : 36 3rd Qu.:0.1872 3rd Qu.:0.7220
## D-Glucosaminic Acid : 36 Max. :2.6500 Max. :2.7850
## (Other) :936
## Hr_144
## Min. :0.00000
## 1st Qu.:0.04175
## Median :0.75200
## Mean :0.92497
## 3rd Qu.:1.67950
## Max. :3.11600
##
Let's do some exploratory analysis.
# Scatterplot matrix of every column against every other column.
pairs(data)
# Report the class of each column, one column at a time.
class(data[["Sample.ID"]])
## [1] "factor"
class(data[["Rep"]])
## [1] "integer"
class(data[["Well"]])
## [1] "factor"
class(data[["Dilution"]])
## [1] "numeric"
class(data[["Substrate"]])
## [1] "factor"
class(data[["Hr_24"]])
## [1] "numeric"
class(data[["Hr_48"]])
## [1] "numeric"
class(data[["Hr_144"]])
## [1] "numeric"
Some regression models and summary statistics.
# Simple linear regression of Dilution on the Hr_24 column, followed by its
# coefficient table and fit statistics.
a <- lm(Dilution ~ Hr_24, data = data)
summary(a)
##
## Call:
## lm(formula = Dilution ~ Hr_24, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03664 -0.03607 -0.02750 0.06237 0.06787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.037644 0.001497 25.146 <2e-16 ***
## Hr_24 -0.003784 0.004173 -0.907 0.365
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04472 on 1150 degrees of freedom
## Multiple R-squared: 0.0007146, Adjusted R-squared: -0.0001544
## F-statistic: 0.8223 on 1 and 1150 DF, p-value: 0.3647
# Simple linear regression of Dilution on the Hr_48 column, followed by its
# coefficient table and fit statistics.
b <- lm(Dilution ~ Hr_48, data = data)
summary(b)
##
## Call:
## lm(formula = Dilution ~ Hr_48, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03713 -0.03571 -0.02745 0.06198 0.06650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.038127 0.001710 22.296 <2e-16 ***
## Hr_48 -0.002403 0.002324 -1.034 0.301
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04472 on 1150 degrees of freedom
## Multiple R-squared: 0.0009286, Adjusted R-squared: 5.981e-05
## F-statistic: 1.069 on 1 and 1150 DF, p-value: 0.3014
# Simple linear regression of Dilution on the Hr_144 column, followed by its
# coefficient table and fit statistics.
# NOTE(review): this object shares its name with base::c(); calls such as
# c(1, 2) still resolve to the function, but a more descriptive name would
# be clearer.
c <- lm(Dilution ~ Hr_144, data = data)
summary(c)
##
## Call:
## lm(formula = Dilution ~ Hr_144, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04068 -0.03168 -0.02651 0.05956 0.07303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.041682 0.001923 21.68 < 2e-16 ***
## Hr_144 -0.005062 0.001520 -3.33 0.000896 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04452 on 1150 degrees of freedom
## Multiple R-squared: 0.00955, Adjusted R-squared: 0.008689
## F-statistic: 11.09 on 1 and 1150 DF, p-value: 0.0008963
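Of the three time points, Hr_144 is the only statistically significant predictor of Dilution (p ≈ 0.0009; Hr_24 and Hr_48 both have p > 0.3), although it explains less than 1% of the variance (R² ≈ 0.0096).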
# Histograms of Dilution and of the Hr_144, Hr_48 and Hr_24 columns, one plot
# per call. The calls are kept verbatim because hist() builds its default
# title and x-axis label from the argument expression (e.g. "data$Dilution").
hist(data$Dilution)
hist(data$Hr_144)
hist(data$Hr_48)
hist(data$Hr_24)
# List the column names of the data frame.
names(data)
## [1] "Sample.ID" "Rep" "Well" "Dilution" "Substrate" "Hr_24"
## [7] "Hr_48" "Hr_144"
# Boxplot of Dilution against Substrate, drawn in one panel per Sample.ID.
# Columns are mapped by bare name: aes() is evaluated inside `data`, so a
# `data$` prefix is redundant, leaks into the axis labels, and bypasses the
# layer data that faceting relies on.
# NOTE(review): the summary of `data` shows Dilution taking very few distinct
# values, so these boxes may carry little information - confirm the intended
# x/y variables.
ggplot(data, aes(x = Dilution, y = Substrate)) +
  geom_boxplot() +
  facet_wrap(~Sample.ID)
# Stacked histograms of Hr_24, filled by sample (fig1) and by substrate
# (fig2). Columns are mapped by bare name inside aes(): a `data$` prefix
# bypasses the layer data and shows up verbatim in axis and hover labels.
fig1 <- ggplot(data, aes(x = Hr_24, fill = Sample.ID)) +
  geom_histogram()
fig2 <- ggplot(data, aes(x = Hr_24, fill = Substrate)) +
  geom_histogram()
# Render each figure as an interactive plotly widget.
ggplotly(fig1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(fig2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Stacked histograms of Hr_48 (fig3, fig4) and Hr_144 (fig5, fig6), filled by
# sample and by substrate respectively. Columns are mapped by bare name
# inside aes(): a `data$` prefix bypasses the layer data and shows up
# verbatim in axis and hover labels.
fig3 <- ggplot(data, aes(x = Hr_48, fill = Sample.ID)) +
  geom_histogram()
fig4 <- ggplot(data, aes(x = Hr_48, fill = Substrate)) +
  geom_histogram()
fig5 <- ggplot(data, aes(x = Hr_144, fill = Sample.ID)) +
  geom_histogram()
fig6 <- ggplot(data, aes(x = Hr_144, fill = Substrate)) +
  geom_histogram()
# Render each figure as an interactive plotly widget.
ggplotly(fig3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(fig4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(fig5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(fig6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.